Clean up the Data

library(readr)
library(dplyr)
library(stringr)

# Read the raw comparison file and keep only the columns used further down.
df <- read.csv("raw_data/coffee_compare.csv")
coffee <- select(
    df,
    DBA, reinspections, checks, violations,
    score, inspections, BORO, SCORE
)

# Collapse the business name into a two-value brand label: names containing
# "DUNKIN" become "DD", every other name is labelled "Starbucks".
coffee <- mutate(
    coffee,
    DBA = ifelse(str_detect(DBA, "DUNKIN"), "DD", "Starbucks")
)

# Number of rows per brand.
coffee$DBA %>% table()
## .
##        DD Starbucks 
##       625       346
# Number of rows per borough.
coffee$BORO %>% table()
## .
##         BRONX      BROOKLYN     MANHATTAN        QUEENS STATEN ISLAND 
##           103           183           412           230            43

Visualization

library(ggthemes)
library(ggplot2)
library(plotly)

# Count the rows for every brand / borough combination; the bar chart below
# plots these counts as `Value`.
coffee_new <- coffee %>% 
    group_by(DBA, BORO) %>% 
    summarize(Value = n()) %>% 
    # summarize() only peels off the last grouping variable, so the result
    # would otherwise stay grouped by DBA; drop the grouping explicitly so
    # later operations on coffee_new are not silently done per brand.
    ungroup()

# Dodged bar chart of the per-brand, per-borough counts held in coffee_new.
# NOTE(review): `Value` is a row count (n()), while the axis label and title
# call it "Health Violations" -- confirm the wording matches what is plotted.
pc <- ggplot(coffee_new, aes(x = BORO, y = Value, fill = DBA)) + 
    geom_bar(position = "dodge", stat = "identity", width = 0.5) +
    labs(x = 'Neighborhood',
         y = 'Health Violations',
         title = 'Health Violations of Coffee Brands by Neighborhoods',
         caption = 'Data Source: DOHMH',
         fill = 'Brands') +
    theme_bw() +
    # Key the colours by brand name instead of relying on level order, so the
    # mapping is explicit and matches the one used by the scatter plots.
    scale_fill_manual(values = c("DD" = "#f09a56", 
                                 'Starbucks' = "#87dc97")) +
    # theme() must follow theme_bw(), which would otherwise reset these.
    theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
          legend.text = element_text(size = 8),
          legend.title = element_text(size = 8))
pc

# Interactive scatter plot: the `violations` column against borough, one
# semi-transparent point per row of `coffee`, coloured by brand.
pc1 <- ggplot(data = coffee,
              mapping = aes(x = BORO, y = violations, colour = DBA)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c(DD = "#f09a56", Starbucks = "#87dc97")) +
    labs(x = "Neighborhood",
         y = "Health Violations",
         title = "Health Violations of Coffee Brands by Neighborhoods",
         caption = "DOHMH") +
    theme_bw() +
    # Applied after theme_bw() so these tweaks are not overwritten.
    theme(plot.title = element_text(face = "bold", size = 12, hjust = 0.5),
          legend.title = element_text(size = 8),
          legend.text = element_text(size = 8))
ggplotly(pc1)
# Interactive scatter plot: the `SCORE` column against borough, one
# semi-transparent point per row of `coffee`, coloured by brand.
pc2 <- ggplot(data = coffee,
              mapping = aes(x = BORO, y = SCORE, colour = DBA)) +
    geom_point(alpha = 0.5) +
    scale_colour_manual(values = c(DD = "#f09a56", Starbucks = "#87dc97")) +
    labs(x = "Neighborhood",
         y = "Total Score for a Particular Inspection",
         title = "Score of Coffee Brands by Neighborhoods",
         caption = "DOHMH") +
    theme_bw() +
    # Applied after theme_bw() so these tweaks are not overwritten.
    theme(plot.title = element_text(face = "bold", size = 12, hjust = 0.5),
          legend.title = element_text(size = 8),
          legend.text = element_text(size = 8))
ggplotly(pc2)

Supervised Machine Learning

Question: Can we tell whether a coffee store is a Starbucks or not?

The outcome is binary in this case.

library(caret)

# Encode the brand as a two-level factor: "DD" maps to 0 and anything else to
# 1, then 1 is labelled "Starbucks" and 0 "DD". Listing 1 first makes
# "Starbucks" the first factor level.
coffee$DBA <- factor(as.integer(coffee$DBA != "DD"),
                     levels = c(1L, 0L),
                     labels = c("Starbucks", "DD"))

# Reproducible 80/20 split on the outcome; rows selected by
# createDataPartition() form the training set, the rest the test set.
set.seed(12345)
in_train <- createDataPartition(coffee$DBA, p = 0.8, list = FALSE)
training <- coffee[in_train, ]
testing <- coffee[-in_train, ]
# Logistic regression of brand on checks, violations, score and borough,
# fitted on the training split.
logit <- glm(DBA ~ checks + violations + score + BORO, 
             data = training, family = binomial(link = "logit"))

# Predicted probabilities for the held-out rows.
y_hat_logit <- predict(logit, newdata = testing, type = "response")

# For a factor response, binomial glm() treats the FIRST level as "failure"
# and the other level as "success", so y_hat_logit is the probability of the
# SECOND level of DBA. A probability above 0.5 must therefore be labelled
# with the second level and everything else with the first. Taking the labels
# from levels(training$DBA) keeps the predictions aligned with the response
# coding and gives z_logit the same level order as testing$DBA.
# (Previously TRUE was labelled with the first level, which inverted every
# prediction.)
z_logit <- factor(y_hat_logit > 0.5, 
                  levels = c(FALSE, TRUE), 
                  labels = levels(training$DBA))

# NOTE: the confusion-matrix printout recorded after this call (accuracy
# 0.299) was produced with the inverted labels. With the corrected labels the
# two "Prediction" rows swap, giving (35 + 101) / 194 ~ 0.70 accuracy; rerun
# to refresh the remaining statistics.
confusionMatrix(z_logit, reference = testing$DBA)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Starbucks  DD
##   Starbucks        34 101
##   DD               35  24
##                                           
##                Accuracy : 0.299           
##                  95% CI : (0.2355, 0.3687)
##     No Information Rate : 0.6443          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.2596         
##                                           
##  Mcnemar's Test P-Value : 2.494e-08       
##                                           
##             Sensitivity : 0.4928          
##             Specificity : 0.1920          
##          Pos Pred Value : 0.2519          
##          Neg Pred Value : 0.4068          
##              Prevalence : 0.3557          
##          Detection Rate : 0.1753          
##    Detection Prevalence : 0.6959          
##       Balanced Accuracy : 0.3424          
##                                           
##        'Positive' Class : Starbucks       
## 
# Linear discriminant analysis on the same predictors, fitted through caret.
# train()'s pre-processing argument is spelled `preProcess`; the earlier
# spelling `reProcess` does not match it, so centring and scaling were never
# requested from train().
# NOTE(review): LDA class predictions are not expected to depend on centring
# or scaling the predictors, so the recorded confusion matrix should be
# unchanged -- confirm on rerun.
LDA <- train(DBA ~ checks + violations + score + BORO, 
             data = training, method = "lda", 
             preProcess = c("center", "scale"))

# Class predictions on the held-out rows, compared with the true brands.
z <- predict(LDA, newdata = testing)
confusionMatrix(z, testing$DBA)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Starbucks DD
##   Starbucks        38 26
##   DD               31 99
##                                           
##                Accuracy : 0.7062          
##                  95% CI : (0.6367, 0.7693)
##     No Information Rate : 0.6443          
##     P-Value [Acc > NIR] : 0.04083         
##                                           
##                   Kappa : 0.3484          
##                                           
##  Mcnemar's Test P-Value : 0.59624         
##                                           
##             Sensitivity : 0.5507          
##             Specificity : 0.7920          
##          Pos Pred Value : 0.5938          
##          Neg Pred Value : 0.7615          
##              Prevalence : 0.3557          
##          Detection Rate : 0.1959          
##    Detection Prevalence : 0.3299          
##       Balanced Accuracy : 0.6714          
##                                           
##        'Positive' Class : Starbucks       
##